Main QC plots
VlnPlot(ds_c, features = c('nCount_RNA','nFeature_RNA', 'percent_mt', 'percent_hb', "percent_ribo", "percent_plat"), pt.size = 1.3, ncol = 1) + NoLegend()
This script identifies lineages of EVs among cells from pleural effusion samples.
#install.packages("Seurat")
#install.packages("remotes")
#BiocManager::install(version = '3.16')
#BiocManager::install("glmGamPoi")
#remotes::install_github("stephenturner/annotables")
library(Seurat)
library(dplyr)
library(data.table)
library(Matrix)
library(ggplot2)
library(ggpubr)
library(future)
library(annotables) # for turning Ensembl ID to symbol
library(sctransform) # for normalization
#library(glmGamPoi) # for normalization
"%notin%" <- Negate("%in%")
"%notlike%" <- Negate("%like%")
theme_set(new = theme_classic())
theme_update(
axis.text.x = element_text(vjust = 0.5),
strip.background = element_rect(fill = '#FFFFFF'),
plot.title = element_text(hjust = 0.5, size = 25),
axis.title = element_text(size = 23),
axis.text = element_text(size = 20),
legend.text = element_text(size = 18),
legend.key.size = unit(2, 'line'),
legend.title = element_text(size = 20)
)
wd <- "/disk2/user/radgro/projects/2023-06_pesc_analysis"
knitr::opts_knit$set(root.dir = wd)
# plan("multicore", workers = 8)
# plan()
# A demonstration of advantages of a sparse matrix ("d" for data).
d_dc <- as.matrix(read.csv("data/DC_matrix.txt", sep = "\t"))
s1 <- object.size(d_dc)
d_dc <- as(d_dc, "sparseMatrix")
s2 <- object.size(d_dc)
s1-s2
# for EVs
d_dc <- as.sparse(read.csv("data/DC_matrix.txt", sep = "\t"))
# for cells
d_c <- as.sparse(read.csv("data/PEsc_matrix.txt", sep = "\t"))
# Gene names
gt <- fread("data/genes_title.txt", sep = "\t")
# get gene symbols from annotables
gt_s <- setDT(grch38[, c("ensgene", "symbol")])
colnames(gt_s)[1] <- "Geneid"
Join gene names with gene symbols and change symbols for Geneid when the grch38 doesn’t include any particular ID, mult set to ‘first’, to exclude duplicates/synonyms.
gt_mito <- readLines("code_for_daniel/human_mitochondrial_genes_list.txt")[-1] # Could be used for Daniel's approach
gt_join[Geneid %in% gt_mito] # same genes are mitochondrial here and in Daniel's table - all is fine
Insert joined gene names and symbols into datasets.
rownames(d_c) <- gt_sym
rownames(d_dc) <- gt_sym
ds_c <- CreateSeuratObject(count = d_c, min.cells = 0, min.features = 1, project = "cells")
## Warning: Non-unique features (rownames) present in the input matrix, making
## unique
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
rm(d_c)
suppressMessages(gc())
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3309810 176.8 6004982 320.8 4034992 215.5
## Vcells 21862982 166.9 236501590 1804.4 293749668 2241.2
ds_c <- PercentageFeatureSet(ds_c, pattern = "^MT-", col.name = "percent_mt")
ds_c <- PercentageFeatureSet(ds_c, "^RP[SL]", col.name = "percent_ribo")
ds_c <- PercentageFeatureSet(ds_c, "^HB[^(P)]", col.name = "percent_hb")
ds_c <- PercentageFeatureSet(ds_c, "PECAM1|PF4", col.name = "percent_plat")
VlnPlot(ds_c, features = c('nCount_RNA','nFeature_RNA', 'percent_mt', 'percent_hb', "percent_ribo", "percent_plat"), pt.size = 1.3, ncol = 1) + NoLegend()
FeatureScatter(ds_c, "nCount_RNA", "nFeature_RNA", pt.size = 1, plot.cor = F) + scale_x_continuous(labels = scales::scientific) + NoLegend()
How many cells have more than 800 RNAs or more than 1000 RNAs and genes found.
length(WhichCells(ds_c, expression = nCount_RNA > 800))
## [1] 1765
length(WhichCells(ds_c, expression = nCount_RNA > 1000 & nFeature_RNA > 600))
## [1] 1622
Filter out cells that have <=1000 features and UMIs as they are the only ones You can work with (subsetting on the percentages of different transcript types can be done later).
The percent of ribosomal genes might be low due to low transcriptional activity of the cells (5%, which was a threshold used before, removes too many cells).
ds_cf <- subset(x = ds_c, subset = nCount_RNA > 1000 & nFeature_RNA > 600 & percent_mt < 25 & percent_ribo > .5
& percent_hb < 5)
rm(ds_c)
suppressMessages(gc())
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3528201 188.5 6004982 320.8 6004982 320.8
## Vcells 30151676 230.1 189201272 1443.5 293749668 2241.2
C <- ds_cf@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]
par(mar=c(5,10,1,1))
boxplot(as.matrix(t(C[most_expressed,])), cex = 1, las = 1, xlab = "% total count per cell",
col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)
VlnPlot(ds_cf, features = c('percent_mt', "percent_ribo"))
VlnPlot(ds_cf, features = c('nFeature_RNA','nCount_RNA'), log = T) ## UGLY
p1 <- ggplot(as.data.table(ds_cf$nFeature_RNA)) +
geom_violin(aes(x = "BM", y = V1)) +
geom_jitter(aes(x = "BM", y = V1), size = .9, alpha = .4) +
scale_y_log10() +
ggtitle('Genes') +
theme(plot.title = element_text(size = 18)) +
xlab("") +
ylab("count")
p2 <- ggplot(as.data.table(ds_cf$nCount_RNA)) +
geom_violin(aes(x = "BM", y = V1)) +
geom_jitter(aes(x = "BM", y = V1), size = .9, alpha = .4) +
scale_y_log10() +
ggtitle('RNAs') +
theme(plot.title = element_text(size = 18)) +
xlab("") +
ylab("")
p3 <- ggplot(as.data.table(ds_cf$percent_mt)) +
geom_violin(aes(x = "BM", y = V1)) +
geom_jitter(aes(x = "BM", y = V1), size = .9, alpha = .4) +
ggtitle('Percent of\nmitochondrial') +
theme(plot.title = element_text(size = 18)) +
xlab("") +
ylab("percent")
p4 <- ggplot(as.data.table(ds_cf$percent_ribo)) +
geom_violin(aes(x = "BM", y = V1)) +
geom_jitter(aes(x = "BM", y = V1), size = .9, alpha = .4) +
ggtitle('Percent of\nribosomal') +
theme(plot.title = element_text(size = 18)) +
xlab("") +
ylab("")
p5 <- ggplot(as.data.table(ds_cf$percent_hb)) +
geom_violin(aes(x = "BM", y = V1)) +
geom_jitter(aes(x = "BM", y = V1), size = .9, alpha = .4) +
ggtitle('Percent of\nhemoglobin') +
theme(plot.title = element_text(size = 18)) +
xlab("") +
ylab("")
p6 <- ggplot(as.data.table(ds_cf$percent_plat)) +
geom_violin(aes(x = "BM", y = V1)) +
geom_jitter(aes(x = "BM", y = V1), size = .9, alpha = .4) +
ggtitle('Percent of\nplatelet') +
theme(plot.title = element_text(size = 18)) +
xlab("") +
ylab("")
ggarrange(p1, p2, p3, p4, p5, p6, nrow = 1)
# ggarrange(ggarrange(p1, p2),
# ggarrange(p3, p4, p5, p6, ncol = 2, nrow = 2, widths = 2),
# nrow = 2)
#ggsave("qc_violins_filt_p3.png", qcplots, device = 'png', dpi = "retina", width = 21, height = 12, bg = "white")
ds_dc <- CreateSeuratObject(count = d_dc, min.cells = 0, min.features = 1, project = "ev")
## Warning: Non-unique features (rownames) present in the input matrix, making
## unique
## Warning: Feature names cannot have underscores ('_'), replacing with dashes
## ('-')
rm(d_dc)
suppressMessages(gc())
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3465345 185.1 6400157 341.9 6400157 341.9
## Vcells 64183119 489.7 284404267 2169.9 309787115 2363.5
ds_dc <- PercentageFeatureSet(ds_dc, pattern = "^MT-", col.name = "percent_mt")
ds_dc <- PercentageFeatureSet(ds_dc, "^RP[SL]", col.name = "percent_ribo")
ds_dc <- PercentageFeatureSet(ds_dc, "^HB[^(P)]", col.name = "percent_hb")
ds_dc <- PercentageFeatureSet(ds_dc, "PECAM1|PF4", col.name = "percent_plat")
VlnPlot(ds_dc, features = c('nCount_RNA','nFeature_RNA', 'percent_mt', 'percent_hb', "percent_ribo", "percent_plat"), pt.size = 1.3, ncol = 1) + NoLegend()
FeatureScatter(ds_dc, "nCount_RNA", "nFeature_RNA", pt.size = 1, plot.cor = F) +
scale_x_continuous(labels = scales::scientific) +
scale_y_continuous(labels = scales::scientific)
___ ## Filtering
length(WhichCells(ds_dc, expression = nFeature_RNA > 100))
## [1] 569
length(WhichCells(ds_dc, expression = nFeature_RNA > 100 & nCount_RNA > 500))
## [1] 541
# most common genes for cells:
sort(Matrix::rowSums(ds_cf), decreasing = T)[1:10]
## MT-RNR2 MALAT1 ACTB MT-RNR1 CD74 MT-ND4 MT-CYB FTL B2M PSAP
## 1142527 795294 738633 664680 579049 568926 517594 490462 480441 432383
# most common genes for EVs
sort(Matrix::rowSums(ds_dc), decreasing = T)[1:10]
## HELLPAR MT-RNR2 ENSG00000279010 ENSG00000280156 ENSG00000279184
## 72147 57170 55177 47776 36823
## MT-RNR1 ENSG00000279080 ENSG00000279738 MT-ND5 MT-ND4
## 34482 29941 28346 23232 23109
ds_dcf <- subset(x = ds_dc, subset = nCount_RNA > 500 & nFeature_RNA > 100 & percent_mt < 90 & percent_ribo < 90)
rm(ds_dc)
suppressMessages(gc())
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3503357 187.1 6400157 341.9 6400157 341.9
## Vcells 70315463 536.5 227523414 1735.9 309787115 2363.5
C <- ds_dcf@assays$RNA@counts
C <- Matrix::t(Matrix::t(C)/Matrix::colSums(C)) * 100
most_expressed <- order(apply(C, 1, median), decreasing = T)[20:1]
par(mar=c(5,10,1,1))
boxplot(as.matrix(t(C[most_expressed,])), cex = 1, las = 1, xlab = "% total count per fraction",
col = (scales::hue_pal())(20)[20:1], horizontal = TRUE)
Logarithmic axes.
VlnPlot(ds_dcf, features = 'nFeature_RNA', pt.size = 1.3, log = T) + NoLegend()
VlnPlot(ds_dcf, features = 'nCount_RNA', pt.size = 1.3, log = T) + NoLegend()
knitr::knit_exit()